Overview

The VDJ is integrated to the Seurat object using the metadata slot. This permits the VDJ features to be used in plotting and other Seurat functions such as finding gene markers.

The RScript for this workflow is available on github in the workshop respository.

R session setup

R packages

We will require Seurat to interact with the Seurat object generated from processing of the gene expression data. We will load the Seurat objects for the datasets from RDS. Once again will be be using the tidyverse packages for data manipulation, ggpubr for plot layouts and also adding stats to panels, ggsci for colour palettes and rstatix for generating ggpubr/ggplot2 compatible data for showing statistics on panels.

#installing R packages
#check if the package is already installed, if not, install it
if (sum(grepl("tidyverse", rownames(installed.packages()))) > 0) {
  print("tidyverse is already installed")
} else {
  install.packages("tidyverse")
}
## [1] "tidyverse is already installed"
if (sum(grepl("Seurat", rownames(installed.packages()))) > 0) {
  print("Seurat is already installed")
} else {
  install.packages("Seurat")
}
## [1] "Seurat is already installed"
if (sum(grepl("ggpubr", rownames(installed.packages()))) > 0) {
  print("ggpubr is already installed")
} else {
  install.packages("ggpubr")
}
## [1] "ggpubr is already installed"
if (sum(grepl("ggsci", rownames(installed.packages()))) > 0) {
  print("ggsci is already installed")
} else {
  install.packages("ggsci")
}
## [1] "ggsci is already installed"
if (sum(grepl("rstatix", rownames(installed.packages()))) > 0) {
  print("rstatix is already installed")
} else {
  install.packages("rstatix")
}
## [1] "rstatix is already installed"
#loading R packages
#tidyverse packages for data manipulation and plotting
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#in order to work with the seurat objects from the scRNA-seq
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## 
## Attaching package: 'SeuratObject'
## 
## The following objects are masked from 'package:base':
## 
##     intersect, saveRDS
## 
## Loading Seurat v5 beta version 
## To maintain compatibility with previous workflows, new Seurat objects will use the previous object structure by default
## To use new Seurat v5 assays: Please run: options(Seurat.object.assay.version = 'v5')
#multi-panel plots
library(ggpubr)
#colour scales for scientific journals
library(ggsci)
#stats package for ggplot2
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following object is masked from 'package:stats':
## 
##     filter

Themes

#themes for plots
theme_custom <- function (base_size = 12) { 
  theme_bw(base_size=base_size) %+replace% 
    theme(panel.grid = element_blank(),
          panel.border = element_blank(),
          axis.line = element_line(colour="black"),
          axis.text.x = element_text(colour="black", size=base_size),
          axis.text.y = element_text(colour="black", size = base_size, hjust = 1),
          axis.title = element_text(colour="black", face="bold", size = base_size),
          strip.background = element_blank(),
          strip.text = element_text(colour="black", face="bold", size=base_size),
          legend.text = element_text(colour="black", size=base_size)
    )
}

theme_x_rotated <- function (base_size = 12) { 
  theme_custom(base_size=base_size) %+replace% 
    theme(axis.text.x = element_text(colour="black", angle = 90, hjust = 1, vjust = 0.5, size=base_size))
}

Loading datasets

For each dataset - PBMC and tumour - we need to load the B and T cell VDJ data that we have previously summarised at the cell barcode level and the Seurat objects from the GEX analysis.

For the T cell data we load the tab-delimited file output from joining the cellranger and IgBLAST data, but for the B cells we will use the tab-delimited file from the clone calling.

These files are available from the github repo but have also been pre-loaded into Posit.cloud:

#loading VDJ data
## loading form git hub
#b.data <- read_tsv("https://raw.githubusercontent.com/kjlj/scRNA-seq_VDJ/main/data/pbmc-tumour_Ig_cellranger_igblast_per-barcode_clns.tsv")
#t.data <- read_tsv("https://raw.githubusercontent.com/kjlj/scRNA-seq_VDJ/main/data/pbmc-tumour_TR_cellranger_igblast_per-barcode.tsv")
## loading from local/posit.cloud
b.data <- read_tsv("../data/pbmc-tumour_Ig_cellranger_igblast_per-barcode_clns.tsv")
## Rows: 6626 Columns: 264
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (118): barcode, contig_id_igh, chain_cellranger_igh, v_gene_cellranger_i...
## dbl (107): length_cellranger_igh, reads_cellranger_igh, umis_cellranger_igh,...
## lgl  (39): is_cell_cellranger_igh, high_confidence_cellranger_igh, full_leng...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t.data <- read_tsv("../data/pbmc-tumour_TR_cellranger_igblast_per-barcode.tsv")
## Rows: 7383 Columns: 264
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (119): barcode, contig_id_tra, chain_cellranger_tra, v_gene_cellranger_t...
## dbl (106): length_cellranger_tra, reads_cellranger_tra, umis_cellranger_tra,...
## lgl  (39): is_cell_cellranger_tra, high_confidence_cellranger_tra, full_leng...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#loading seurat objects
#loading from dropbox as large file need to extend the timeout
#options(timeout=600)
#pbmc.seurat.obj <- readRDS(url("https://www.dropbox.com/scl/fi/4s610vt1mgtmgibdvfsar/pbmc_seurat-without-VDJ-genes-azimuth.rds?rlkey=ftdkxi9mnxezhbb42dqftbqel&dl=1"))
#tumour.seurat.obj <- readRDS(url("https://www.dropbox.com/scl/fi/scik47zay4x27t4wxmo70/tumour_seurat-without-VDJ-genes-azimuth.rds?rlkey=z8ghoeoboaneniv82e2xywji3&dl=1"))

#loading from local/posit.cloud
pbmc.seurat.obj <- readRDS("../seurat_objects/pbmc_seurat-without-VDJ-genes-azimuth.rds")
tumour.seurat.obj <- readRDS("../seurat_objects/tumour_seurat-without-VDJ-genes-azimuth.rds")

Filtering VDJ data

Up until now we have treated the B and T VDJ data within the same dataset independently but they are now going to be united with the scRNA-seq data. This requires that for each dataset, each cell barcode has only a single entry - either B or T VDJ data. There shouldn’t be cells with both B and T VDJ for the same cell barcode, but doublets, which haven’t been filtered from the VDJ, can result in this artifact.

Cell barcodes that have both B and T VDJ data need to be identified so that they can be removed:

#check for any cell barcodes that have both B and T cell data, likely to represent doublets and should be filtered
dual.bt.summary <- bind_rows(b.data %>% 
                               select(barcode, dataset) %>% 
                               mutate(chain = "B"),
                             t.data %>%
                               select(barcode, dataset) %>%
                               mutate(chain = "T")) %>%
  pivot_wider(id_cols = c("barcode", "dataset"),
              names_from = "chain",
              values_from = "chain",
              values_fn = length) %>%
  replace(is.na(.), 0) %>%
  mutate(dual_cell = ifelse(B == 1 & T == 1, TRUE, FALSE))

#get count for the number of cells with both B and T cell data from each dataset
with(dual.bt.summary, table(dataset, dual_cell))
##         dual_cell
## dataset  FALSE TRUE
##   pbmc    6116   96
##   tumour  7365  168

Add the dual_cell column to the B and T data and filter the cells that are TRUE:

b.data <- b.data %>%
  left_join(dual.bt.summary %>% select(-c(T, B)), by = c("barcode", "dataset")) %>%
  filter(dual_cell == FALSE)

table(b.data$dataset)
## 
##   pbmc tumour 
##    886   5476
t.data <- t.data %>%
  left_join(dual.bt.summary %>% select(-c(T, B)), by = c("barcode", "dataset")) %>%
  filter(dual_cell == FALSE)

table(t.data$dataset)
## 
##   pbmc tumour 
##   5230   1889
rm(dual.bt.summary)

There may also be VDJ cell barcodes for which there is no corresponding data in the scRNA-seq due to the cell being removed during filtering. We will drop VDJ cell barcodes that don’t have data in the GEX datasets.

To remove the cells, we need a list of the cell barcodes for the PBMC and tumour samples from the Seurat objects. This can be obtained from the rownames of the meta.data slot in each of the objects:

#get the barcodes for the PBMCs
pbmc.barcodes <- tibble(barcode = rownames(pbmc.seurat.obj@meta.data)) %>%
  mutate(dataset = "pbmc",
         in_gex = TRUE)

#get the barcodes for the tumour
tumour.barcodes <- tibble(barcode = rownames(tumour.seurat.obj@meta.data)) %>%
  mutate(dataset = "tumour",
         in_gex = TRUE)

#combine, the dataset col keeps things unique
barcodes <- bind_rows(pbmc.barcodes, tumour.barcodes)

#join with the B cell data
b.data <- b.data %>%
  left_join(barcodes) %>%
  mutate(in_gex = ifelse(is.na(in_gex), FALSE, TRUE))
## Joining with `by = join_by(barcode, dataset)`
#join with the T cell data
t.data <- t.data %>%
  left_join(barcodes) %>%
  mutate(in_gex = ifelse(is.na(in_gex), FALSE, TRUE))
## Joining with `by = join_by(barcode, dataset)`
#print the number of cell barcodes that overlap with the GEX
with(t.data, table(dataset, in_gex))
##         in_gex
## dataset  FALSE TRUE
##   pbmc     162 5068
##   tumour   183 1706
with(b.data, table(dataset, in_gex))
##         in_gex
## dataset  FALSE TRUE
##   pbmc      46  840
##   tumour   571 4905
#removing data no longer needed to save memory due to size of seurat objects
rm(list = c("pbmc.barcodes", "tumour.barcodes", "barcodes"))

Adding additional data

There are some additional data fields that could be of interest for anlaysis that we can add to the B and T cell datasets.

For the B cells, remove the cells that aren’t in the GEX and add columns for paired chains, clone size, whether or not the cell is part of an expanded clone and percent of somatic hypermutation and the isotype:

#add extra data fields to the b cell data
b.data <- b.data %>%
  filter(in_gex) %>% #remove VDJ for cells that aren't in the GEX
  mutate(bcr_paired = ifelse(!(is.na(v_call_igblast_igh)) & !(is.na(v_call_igblast_igkl)), TRUE, FALSE)) %>% #add a column to indicate if the b cells include paired chains
  group_by(dataset, clone_id, unq_clone_id) %>% #group cells by their clone ids within each dataset in order to add columns for # cells in a clone and whether or not a clone is expanded
  mutate(cln_size = length(barcode)) %>% #add the size for each clone, using mutate rather than summarise
  ungroup() %>% #remove the grouping
  mutate(b_expanded_cln = ifelse(cln_size == 1, "singleton", "expanded")) %>% #add a column indicating if the clone is expanded based on the cell cnt for the clone
  mutate(vh_mut = ifelse(is.na(v_identity_igblast_igh), NA_integer_, 100 - v_identity_igblast_igh),
         vkl_mut = ifelse(is.na(v_identity_igblast_igkl), NA_integer_, 100 - v_identity_igblast_igkl),
         b_mut_class = ifelse(vh_mut > 2 | vkl_mut > 2, "mutated", "unmutated")) %>% #for the Ig data add the SHM & whether a cell's Ig is mutated (using 2% threshold)
  mutate(isotype = ifelse(is.na(c_call_igblast_igh), NA_character_, str_replace(c_call_igblast_igh, "\\*.+$", ""))) #for the Ig data add the isotype subclass

#number of CELLs in expanded clones
with(b.data, table(dataset, b_expanded_cln))
##         b_expanded_cln
## dataset  expanded singleton
##   pbmc         12       828
##   tumour      182      4723
#number of cells that are mutated vs unmutated
with(b.data, table(dataset, b_mut_class))
##         b_mut_class
## dataset  mutated unmutated
##   pbmc       169       665
##   tumour    1145      3683
#number of cells for each isotype subclass
with(b.data, table(dataset, isotype))
##         isotype
## dataset  IGHA1 IGHA2 IGHD IGHE IGHG1 IGHG1,IGHG2 IGHG2 IGHG3 IGHG4 IGHM
##   pbmc      30     7   12    0    32           0    15     5     0  731
##   tumour   197    52  345    1   240           4   168    45     1 3713

For the T cells, removing the cells that don’t have the barcodes in GEX, and then adding paired clonotype label, clone size and whether or not the cell is part of a clonal expansion:

#add additional fields to the t cell data
t.data <- t.data %>%
  filter(in_gex) %>% #removing cells that aren't in the GEX
  mutate(tcr_paired = ifelse( !(is.na(clonotype_tra)) & !(is.na(clonotype_trb)), TRUE, FALSE)) %>% #add a column to indicate if the t cells include paired chains
  mutate(paired_clonotype = case_when(is.na(clonotype_tra) & !(is.na(clonotype_trb)) ~ paste(clonotype_trb),
                                      !(is.na(clonotype_tra)) & is.na(clonotype_trb) ~ paste(clonotype_tra),
                                      !(is.na(clonotype_tra)) & !(is.na(clonotype_trb)) ~ paste(clonotype_trb, clonotype_tra, sep = ":"),
                                      TRUE ~ NA_character_)) %>% #for the t cell add a clonotype label that joins both the TRA and TRB, the clone clustering for the Ig already considered the IGH and IGKL
  group_by(dataset, paired_clonotype) %>% #group cells by their paired clonotype labels to add cols for # cells in each clonotype and whether clonotype is expanded
  mutate(cln_size = length(barcode)) %>% #add the size for each clone, using mutate rather than summarise
  ungroup() %>% #remove the grouping
  mutate(t_expanded_cln = ifelse(cln_size == 1, "singleton", "expanded")) %>% #add a column indicating if the clone is expanded based on the cell cnt for the clone
  mutate(isotype = ifelse(is.na(c_call_igblast_trb), NA_character_, str_replace(c_call_igblast_trb, "\\*.+$", ""))) #adding the TRB constant region type, not actually isotype but for plotting same col name

#number of CELLs in expanded clones
with(t.data, table(dataset, t_expanded_cln))
##         t_expanded_cln
## dataset  expanded singleton
##   pbmc        125      4943
##   tumour       45      1661
with(t.data, table(dataset, isotype))
##         isotype
## dataset  TRBC1 TRBC2
##   pbmc    2197  2787
##   tumour   602  1050

Ranks for the expanded clones/clonotypes can also be added (where #1 is the largest clone/clonotype in the dataset):

## adding a rank for the clones/clonotypes
b.cln.rnks <- b.data %>%
  filter(!(is.na(unq_clone_id))) %>%
  group_by(dataset, unq_clone_id) %>%
  summarise(cln_size = length(barcode)) %>%
  ungroup() %>%
  mutate(expanded = ifelse(cln_size > 1, TRUE, FALSE)) %>%
  filter(expanded) %>%
  group_by(dataset) %>%
  mutate(b_cln_rnk = min_rank(desc(cln_size))) %>% 
  ungroup() %>%
  arrange(b_cln_rnk)
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
b.data <- b.data %>%
  left_join(b.cln.rnks %>% select(dataset, unq_clone_id, b_cln_rnk),
            by = c("unq_clone_id", "dataset"))

rm(b.cln.rnks)

t.cln.rnks <- t.data %>%
  filter(!(is.na(paired_clonotype))) %>%
  group_by(dataset, paired_clonotype) %>%
  summarise(cln_size = length(barcode)) %>%
  ungroup() %>%
  mutate(expanded = ifelse(cln_size > 1, TRUE, FALSE)) %>%
  filter(expanded) %>%
  group_by(dataset) %>%
  mutate(t_cln_rnk = min_rank(desc(cln_size))) %>% 
  ungroup() %>%
  arrange(t_cln_rnk)
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
t.data <- t.data %>%
  left_join(t.cln.rnks %>% select(dataset, paired_clonotype, t_cln_rnk),
            by = c("paired_clonotype", "dataset"))

rm(t.cln.rnks)

Adding metadata to Seurat object

The B and T cell data can now be formatted to add to the Seurat objects for the scRNA-seq analysis. The join will happen via the cell barcode and here we are selecting the subset of columns that want to add into the Seurat meta.data. This list can be amended as needed:

## this can be altered
b.metadata <- b.data %>%
  select(barcode, dataset, v_igh, j_igh, v_igkl, j_igkl, unq_clone_id, bcr_paired, b_cln_rnk, b_expanded_cln,
         vh_mut, vkl_mut, b_mut_class, isotype)

t.metadata <- t.data %>%
  select(barcode, dataset, v_tra, j_tra, v_trb, j_trb, paired_clonotype, tcr_paired, t_cln_rnk, t_expanded_cln, isotype)

To add the metadata need to convert from a tibble to a data.frame and move the barcode to the row names. We also need to split the PBMC and tumour to separate data frames:

##PBMC
#create a data.frame from combining the B and T cell data for the PBMC dataset
#adding an extra column after bring the data together to indicate whether a cell has B VDJ, T VDJ or neither
pbmc.metadata <- as.data.frame(b.metadata %>% filter(dataset == "pbmc") %>%
                                 full_join(t.metadata %>% filter(dataset == "pbmc")) %>%
                                 mutate(vdj_cell_type = case_when(!(is.na(paired_clonotype)) ~ "T",
                                                                  !(is.na(unq_clone_id)) ~ "B",
                                                                  TRUE ~ NA_character_)))
## Joining with `by = join_by(barcode, dataset, isotype)`
#add the barcodes as the row names for the data.frame
rownames(pbmc.metadata) <- pbmc.metadata$barcode

#remove the barcode column
pbmc.metadata <- pbmc.metadata %>% select(-barcode)

Adding PBMC metadata:

#add the metadata to the seurat object, as the rownames are the cell barcodes this will ensure everything gets matched up!
pbmc.seurat.obj <- AddMetaData(pbmc.seurat.obj, pbmc.metadata)

#check the meta data cols now in the seurat object
colnames(pbmc.seurat.obj@meta.data)
##  [1] "orig.ident"       "nCount_RNA"       "nFeature_RNA"     "nCount_SCT"      
##  [5] "nFeature_SCT"     "SCT_snn_res.0.8"  "seurat_clusters"  "SCT_snn_res.0.7" 
##  [9] "SCT_snn_res.0.6"  "SCT_snn_res.0.5"  "SCT_snn_res.0.4"  "SCT_snn_res.0.3" 
## [13] "SCT_snn_res.0.2"  "azimuth_l1"       "azimuth_l2"       "azimuth_l3"      
## [17] "dataset"          "v_igh"            "j_igh"            "v_igkl"          
## [21] "j_igkl"           "unq_clone_id"     "bcr_paired"       "b_cln_rnk"       
## [25] "b_expanded_cln"   "vh_mut"           "vkl_mut"          "b_mut_class"     
## [29] "isotype"          "v_tra"            "j_tra"            "v_trb"           
## [33] "j_trb"            "paired_clonotype" "tcr_paired"       "t_cln_rnk"       
## [37] "t_expanded_cln"   "vdj_cell_type"
#can remove the data frame once it is in the seurat object
rm(list = c("pbmc.metadata"))

Repeating for the tumour dataset:

##tumour
#create a data.frame from B and T data for the tumour dataset
tumour.metadata <- as.data.frame(b.metadata %>% filter(dataset == "tumour") %>%
                                   full_join(t.metadata %>% filter(dataset == "tumour")) %>%
                                   mutate(vdj_cell_type = case_when(!(is.na(paired_clonotype)) ~ "T",
                                                                    !(is.na(unq_clone_id)) ~ "B",
                                                                    TRUE ~ NA_character_)))
## Joining with `by = join_by(barcode, dataset, isotype)`
#add the barcodes as the row names for the data.frame
rownames(tumour.metadata) <- tumour.metadata$barcode

#remove the barcode column
tumour.metadata <- tumour.metadata %>% select(-barcode)

#add the metadata to the seurat object, as the rownames are the cell barcodes this will ensure everything gets matched up!
tumour.seurat.obj <- AddMetaData(tumour.seurat.obj, tumour.metadata)

#check the meta data cols now in the seurat object
colnames(tumour.seurat.obj@meta.data)
##  [1] "orig.ident"       "nCount_RNA"       "nFeature_RNA"     "nCount_SCT"      
##  [5] "nFeature_SCT"     "SCT_snn_res.0.8"  "seurat_clusters"  "SCT_snn_res.0.7" 
##  [9] "SCT_snn_res.0.6"  "SCT_snn_res.0.5"  "SCT_snn_res.0.4"  "SCT_snn_res.0.3" 
## [13] "SCT_snn_res.0.2"  "azimuth_l1"       "azimuth_l2"       "azimuth_l3"      
## [17] "dataset"          "v_igh"            "j_igh"            "v_igkl"          
## [21] "j_igkl"           "unq_clone_id"     "bcr_paired"       "b_cln_rnk"       
## [25] "b_expanded_cln"   "vh_mut"           "vkl_mut"          "b_mut_class"     
## [29] "isotype"          "v_tra"            "j_tra"            "v_trb"           
## [33] "j_trb"            "paired_clonotype" "tcr_paired"       "t_cln_rnk"       
## [37] "t_expanded_cln"   "vdj_cell_type"
#remove objects that we no longer require
rm(list = c("tumour.metadata", "b.metadata", "t.metadata"))

Plotting using VDJ metadata

Now that the VDJ features are in the meta.data slot for seurat objects they can be used for plotting.

For example, highlighting which cells have B or T VDJs on a UMAP:

#plot the metadata onto the umap within Seurat
p1 <- DimPlot(pbmc.seurat.obj, group.by = "vdj_cell_type", cols = pal_d3()(2),  na.value = "lightgray")
plot(p1)

Or, plotting which cells are using different isotype subclasses or TRBC genes:

p1 <- DimPlot(pbmc.seurat.obj, group.by = "isotype", cols = pal_d3("category20")(10), na.value = "lightgray")
plot(p1)

Plotting which clones are expanded versus singleton:

p1 <- DimPlot(subset(pbmc.seurat.obj, vdj_cell_type == "B"), 
              group.by = "b_expanded_cln", 
              cols = pal_d3("category20")(10), 
              pt.size = 0.8,
              na.value = "lightgray")
## Warning: Removing 3466 cells missing data for vars requested
plot(p1)

Plotting B cell clones based on their clone rank:

p1 <- DimPlot(subset(pbmc.seurat.obj, vdj_cell_type == "B"), 
              group.by = c("b_cln_rnk"), 
              cols = pal_d3("category20")(10), 
              pt.size = 0.8,
              na.value = "lightgray")
## Warning: Removing 3466 cells missing data for vars requested
plot(p1)

Some examples for the tumour dataset:

#tumour
p1 <- DimPlot(tumour.seurat.obj, group.by = "vdj_cell_type", cols = pal_d3()(2),  na.value = "lightgray")
plot(p1)

p1 <- DimPlot(tumour.seurat.obj, group.by = "isotype", cols = pal_d3("category20")(20), na.value = "lightgray")
plot(p1)

The use of the VDJ meta data is not limited to UMAPS. It can also be used for other plots types.

For example, we can look at gene expression of CD3 for each seurat cluster for T cell clonotypes that are either expanded or singleton:

#can use the VDJ data when exploring GEX too
p1 <- VlnPlot(tumour.seurat.obj, group.by = "seurat_clusters", split.by = "t_expanded_cln", features = c("CD3E"))
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.
## Warning: Removing 5795 cells missing data for vars requested
plot(p1)

Or, showing relative gene expression per cell for CD3 for on a feature plot:

p1 <- FeaturePlot(tumour.seurat.obj, split.by = "t_expanded_cln", features = c("CD3E"))
plot(p1)
## Warning: Removed 5795 rows containing missing values (`geom_point()`).
## Removed 5795 rows containing missing values (`geom_point()`).

#remove the plot object and run garbage collection 
rm(p1)
gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells   4427651  236.5    7495724  400.4         NA   7495724  400.4
## Vcells 286010982 2182.1  415452225 3169.7     102400 325964872 2487.0

Plotting outside of Seurat

Plotting within Seurat can lack some flexibility, so in addition to importing data to the Seurat object, we can also export data from Seurat objects for use with tidyverse (or base R) functions. In this example, we will export data from the meta.data and the UMAP coordinates slots, but you can also export the normalised gene expression data using the GetAssayData() function.

For the PBMC sample:

## PBMCs
#metadata slot
pbmc.seurat.meta <- as.data.frame(pbmc.seurat.obj@meta.data) %>% 
  rownames_to_column("barcode") 

#UMAP coords
pbmc.seurat.umap <- as.data.frame(pbmc.seurat.obj@reductions$umap@cell.embeddings) %>%
  rownames_to_column("barcode")

#combining
pbmc.seurat.data <- pbmc.seurat.meta %>%
  left_join(pbmc.seurat.umap, by = "barcode") %>%
  as_tibble()

pbmc.seurat.data %>% head()
## # A tibble: 6 × 41
##   barcode            orig.ident  nCount_RNA nFeature_RNA nCount_SCT nFeature_SCT
##   <chr>              <fct>            <dbl>        <int>      <dbl>        <int>
## 1 AAACCTGAGACAGACC-1 SeuratProj…       4751         1982       5264         1959
## 2 AAACCTGAGCGATAGC-1 SeuratProj…       5750         1759       5735         1745
## 3 AAACCTGAGCGGCTTC-1 SeuratProj…       4991         1957       5384         1939
## 4 AAACCTGAGGATCGCA-1 SeuratProj…       6011         2309       5898         2290
## 5 AAACCTGAGTCACGCC-1 SeuratProj…       7249         2312       6248         2295
## 6 AAACCTGCACGTCAGC-1 SeuratProj…       5660         1946       5687         1932
## # ℹ 35 more variables: SCT_snn_res.0.8 <fct>, seurat_clusters <fct>,
## #   SCT_snn_res.0.7 <fct>, SCT_snn_res.0.6 <fct>, SCT_snn_res.0.5 <fct>,
## #   SCT_snn_res.0.4 <fct>, SCT_snn_res.0.3 <fct>, SCT_snn_res.0.2 <fct>,
## #   azimuth_l1 <chr>, azimuth_l2 <chr>, azimuth_l3 <chr>, dataset <chr>,
## #   v_igh <chr>, j_igh <chr>, v_igkl <chr>, j_igkl <chr>, unq_clone_id <chr>,
## #   bcr_paired <lgl>, b_cln_rnk <int>, b_expanded_cln <chr>, vh_mut <dbl>,
## #   vkl_mut <dbl>, b_mut_class <chr>, isotype <chr>, v_tra <chr>, …
rm(list = c("pbmc.seurat.meta", "pbmc.seurat.umap"))

For the tumour sample:

## Tumour
tumour.seurat.meta <- as.data.frame(tumour.seurat.obj@meta.data) %>% 
  rownames_to_column("barcode") 

tumour.seurat.umap <- as.data.frame(tumour.seurat.obj@reductions$umap@cell.embeddings) %>%
  rownames_to_column("barcode")

tumour.seurat.data <- tumour.seurat.meta %>%
  left_join(tumour.seurat.umap, by = "barcode") %>%
  as_tibble()

tumour.seurat.data %>% head()
## # A tibble: 6 × 41
##   barcode            orig.ident  nCount_RNA nFeature_RNA nCount_SCT nFeature_SCT
##   <chr>              <fct>            <dbl>        <int>      <dbl>        <int>
## 1 AAACCTGAGATCCTGT-1 SeuratProj…       5121         1648       6232         1631
## 2 AAACCTGAGCGATAGC-1 SeuratProj…       7212         1918       6915         1906
## 3 AAACCTGAGGAGTCTG-1 SeuratProj…       5903         1945       6401         1940
## 4 AAACCTGCAGCGTAAG-1 SeuratProj…       7226         2348       6927         2328
## 5 AAACCTGCAGTATAAG-1 SeuratProj…      14042         3363       7489         2861
## 6 AAACCTGCATCTGGTA-1 SeuratProj…      23428         5020       6760         2422
## # ℹ 35 more variables: SCT_snn_res.0.8 <fct>, seurat_clusters <fct>,
## #   SCT_snn_res.0.7 <fct>, SCT_snn_res.0.6 <fct>, SCT_snn_res.0.5 <fct>,
## #   SCT_snn_res.0.4 <fct>, SCT_snn_res.0.3 <fct>, SCT_snn_res.0.2 <fct>,
## #   azimuth_l1 <chr>, azimuth_l2 <chr>, azimuth_l3 <chr>, dataset <chr>,
## #   v_igh <chr>, j_igh <chr>, v_igkl <chr>, j_igkl <chr>, unq_clone_id <chr>,
## #   bcr_paired <lgl>, b_cln_rnk <int>, b_expanded_cln <chr>, vh_mut <dbl>,
## #   vkl_mut <dbl>, b_mut_class <chr>, isotype <chr>, v_tra <chr>, …
#remove the seurat objects now that the necessary data has been dumped
rm(list = c("tumour.seurat.meta", "tumour.seurat.umap",
            "tumour.seurat.obj", "pbmc.seurat.obj"))
#force garbage collection
gc()
##            used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  4404008 235.2    7495724  400.4         NA   7495724  400.4
## Vcells 17012281 129.8  332361780 2535.8     102400 325964872 2487.0

We now have greater control over the plotting and the format of the plots and as a bounus we have freed up a lot of memory by unloading the Seurat objects.

An example of plotting SHM for each B cell on a UMAP plot:

#an example of plotting B cell SHM levels on the UMAP
pbmc.shm.umap.p1 <- ggplot(pbmc.seurat.data, aes(x = umap_1, y = umap_2)) +
  geom_point(data = pbmc.seurat.data %>% filter(vdj_cell_type != "B"),
             size = 0.5,
             colour = "lightgray") + #plotting cells that are NOT classed as B cell from VDJ on the bottom layer of the plot with a smaller point size
  geom_point(data = pbmc.seurat.data %>% filter(vdj_cell_type == "B"),
             size = 1,
             aes(colour = vh_mut)) + #plotting cells that are classed as B cells on top layer and colouring by IGH SHM level
  scale_color_viridis_c(name = "VH SHM %") + #using the continous viridis for the colour
  theme_custom() +
  labs(x = "UMAP_1",
       y = "UMAP_2",
       title = "PBMC IGHV SHM",
       caption = "non B cells in gray") #labels for plot
plot(pbmc.shm.umap.p1)

tumour.shm.umap.p1 <- ggplot(tumour.seurat.data, aes(x = umap_1, y = umap_2)) +
  geom_point(data = tumour.seurat.data %>% filter(vdj_cell_type != "B"),
             size = 0.5,
             colour = "lightgray") + #plotting cells that are NOT classed as B cell from VDJ on the bottom layer of the plot with a smaller point size
  geom_point(data = tumour.seurat.data %>% filter(vdj_cell_type == "B"),
             size = 1,
             aes(colour = vh_mut)) + #plotting cells that are classed as B cells on top layer and colouring by IGH SHM level
  scale_color_viridis_c(name = "VH SHM %") + #using the continuous viridis for the colour
  theme_custom() +
  labs(x = "UMAP_1",
       y = "UMAP_2",
       title = "tumour IGHV SHM",
       caption = "non B cells in gray") #labels for plot
plot(tumour.shm.umap.p1)

Splitting plots across sub-panels is simple using the facet_wrap() or facet_grid() from ggplot2:

#only plotting the B cells, but splitting by isotype
pbmc.shm.umap.p2 <- ggplot(pbmc.seurat.data %>% filter(vdj_cell_type == "B"), aes(x = umap_1, y = umap_2)) +
  geom_point(size = 1,
             aes(colour = vh_mut)) + 
  scale_color_viridis_c(name = "VH SHM %") + #using the continuous viridis for the colour
  theme_custom() +
  labs(x = "UMAP_1",
       y = "UMAP_2",
       title = "PBMC IGHV SHM",
       caption = "only showing cells with Ig VDJ") + #labels for plot
  scale_y_continuous(limits = c(-20, 15)) +
  scale_x_continuous(limits = c(-15, 10)) +
  facet_wrap(~ isotype, nrow = 2, scales = "free") #splitting by the isotype subclass
#plot(pbmc.shm.umap.p2)

tumour.shm.umap.p2 <- ggplot(tumour.seurat.data %>% filter(vdj_cell_type == "B"), aes(x = umap_1, y = umap_2)) +
  geom_point(size = 1,
             aes(colour = vh_mut)) + 
  scale_color_viridis_c(name = "VH SHM %") + #using the continuous viridis for the colour
  theme_custom() +
  labs(x = "UMAP_1",
       y = "UMAP_2",
       title = "tumour IGHV SHM",
       caption = "only showing cells with Ig VDJ") + #labels for plot
  scale_y_continuous(limits = c(-20, 15)) +
  scale_x_continuous(limits = c(-15, 25)) +
  facet_wrap(~ isotype, nrow = 2, scales = "free") #splitting by the isotype subclass
#plot(tumour.shm.umap.p2)

plot(ggarrange(pbmc.shm.umap.p2, tumour.shm.umap.p2, nrow = 2))

We can build more complex panels using ggplot2 and ggpubr. For example, visualising the SHM of the B cells on UMAPs and displaying with a quantification of the SHM% for clones for each isotype with statistics.

Start with a summary of the SHM for each cell for each isotype in the two datasets:

## isotype order by chromosome location
isotype_order <- c("IGHM", "IGHD", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHG4", "IGHE", "IGHA2")

##combining the datasets and adding the isotype levels
seurat.data <- bind_rows(pbmc.seurat.data %>% mutate(dataset = "PBMC"),
                         tumour.seurat.data %>% mutate(dataset = "tumour")) %>%
  mutate(isotype_fac = factor(isotype, isotype_order))

#SHM summary
shm.summary.p1 <- ggplot(seurat.data %>% filter(vdj_cell_type == "B"), 
                         aes(x = isotype_fac, y = vh_mut)) +
  geom_boxplot(aes(group = interaction(dataset, isotype_fac)),
               position = position_dodge(width = 0.9)) +
  geom_point(aes(colour = dataset, group = dataset),
             position = position_dodge(width = 0.9)) +
  theme_x_rotated() +
  labs(x = "isotype subclass",
       y = "VH SHM %") +
  scale_colour_aaas() +
  guides(colour = guide_legend(override.aes = list(size = 5)))
plot(shm.summary.p1)

Adding p-values to determine if there is any difference in the mean SHM between the PBMC and tumour B cells for the different isotypes. This can be done with the rstatix package which offers a variety of different tests and then formats the output for plotting on ggplot2 with ggpubr functions. This call is a little more complex because of how the data is grouped:

#adding stats for difference between isotype SHM means
shm.summary.stats <- seurat.data %>% 
  filter(vdj_cell_type == "B") %>% #only using data from B cells by VDJ
  filter(isotype_fac %in% c("IGHM", "IGHD", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHA2")) %>% #not all isotype have necessary data points, so restrict
  group_by(isotype_fac) %>% #want to perform the tests for each isotype
  wilcox_test(vh_mut ~ dataset) %>% #do th wilcoxon test for difference in means between the datasets
  ungroup() %>% #remove the grouping by isotype
  adjust_pvalue(method = "bonferroni") %>% # do the p-value adjustment
  add_xy_position(x = "isotype_fac", dodge = 0.9) %>% #add the X & y-positions that are needed for plotting
  mutate(group1 = isotype_fac,
         group2 = isotype_fac) #adjust groups to get values in correct position on panel

#print
shm.summary.stats %>% head()
## # A tibble: 6 × 14
##   isotype_fac .y.    group1 group2    n1    n2 statistic       p  p.adj
##   <fct>       <chr>  <fct>  <fct>  <int> <int>     <dbl>   <dbl>  <dbl>
## 1 IGHM        vh_mut IGHM   IGHM     731  3713  1355168. 0.927   1     
## 2 IGHD        vh_mut IGHD   IGHD      12   345     2072. 0.99    1     
## 3 IGHG3       vh_mut IGHG3  IGHG3      5    45      133  0.518   1     
## 4 IGHG1       vh_mut IGHG1  IGHG1     32   240     4992. 0.00585 0.0410
## 5 IGHA1       vh_mut IGHA1  IGHA1     30   197     3514  0.0956  0.669 
## 6 IGHG2       vh_mut IGHG2  IGHG2     15   168     1394. 0.495   1     
## # ℹ 5 more variables: y.position <dbl>, groups <named list>, x <dbl>,
## #   xmin <dbl>, xmax <dbl>

Can now add the p-values to the panel using the stat_pvalue_manual() function by specifying the tibble that contains out test results (have adjusted to only display p-values that are significant):

#add these to the panel using ggpubr function stat_pvalue_manual
shm.summary.p2 <- ggplot(seurat.data %>% filter(vdj_cell_type == "B"), 
                         aes(x = isotype_fac, y = vh_mut)) +
  geom_boxplot(aes(group = interaction(dataset, isotype_fac)),
               position = position_dodge(width = 0.9)) +
  geom_point(aes(colour = dataset, group = dataset),
             position = position_dodge(width = 0.9)) +
  stat_pvalue_manual(data = shm.summary.stats, label.size = 3,
                     hide.ns = TRUE) +
  theme_x_rotated() +
  labs(x = "isotype subclass",
       y = "VH SHM %",
       caption = "p-values: Wilcoxon, bonf adjusted") +
  scale_colour_aaas() +
  guides(colour = guide_legend(override.aes = list(size = 5)))
plot(shm.summary.p2)

Finally, can use ggarrange() from ggpubr to combine the panels into a single figure panel:

plot(ggarrange(ggarrange(pbmc.shm.umap.p1, tumour.shm.umap.p1, nrow = 1),
               shm.summary.p2, nrow = 2, ncol = 1))

Exploring the extent of clone/clonotype expansion in the different datasets by building up a figure panel. Starting with a UMAP showing the expanded and singleton clones for the samples:

exp.umap.p1 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2)) +
  geom_point(data = seurat.data %>% filter(is.na(vdj_cell_type)),
             size = 0.5, colour = "lightgray") + #bottom layer are the cell that don't have any VDJ
  geom_point(data = seurat.data %>% filter(vdj_cell_type == "B", b_expanded_cln != "expanded"),
             size = 0.8,
             aes(colour = b_expanded_cln)) + #B cells, singleton
  geom_point(data = seurat.data %>% filter(vdj_cell_type == "B", b_expanded_cln == "expanded"),
             size = 0.8,
             aes(colour = b_expanded_cln)) + #B cells, expanded
  geom_point(data = seurat.data %>% filter(vdj_cell_type == "T", t_expanded_cln != "expanded"),
             aes(colour = t_expanded_cln),
             size = 0.8) + #T cells, singleton
  geom_point(data = seurat.data %>% filter(vdj_cell_type == "T", t_expanded_cln == "expanded"),
             aes(colour = t_expanded_cln),
             size = 0.8) + #T cells, expanded
  theme_custom() +
  facet_grid(dataset ~ vdj_cell_type) +
  scale_colour_npg(name = "expansions") +
  guides(colour = guide_legend(override.aes = list(size = 5)))
#exp.umap.p1

Next make stacked barplot to summarise the proportion of clones that are expanded for B cells in each sample, along with the % of cells that are in expansions versus singleton clones:

##summarise
b.expanded.clns.summary <- b.data %>%
  group_by(dataset, b_expanded_cln) %>%
  summarise(cell_cnt = length(barcode),
            cln_cnt = length(unique(unq_clone_id))) %>%
  ungroup() %>%
  group_by(dataset) %>%
  mutate(cell_prop = cell_cnt / sum(cell_cnt),
         cln_prop = cln_cnt / sum(cln_cnt)) %>%
  ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
b.expanded.clns.summary %>% head()
## # A tibble: 4 × 6
##   dataset b_expanded_cln cell_cnt cln_cnt cell_prop cln_prop
##   <chr>   <chr>             <int>   <int>     <dbl>    <dbl>
## 1 pbmc    expanded             12       3    0.0143  0.00361
## 2 pbmc    singleton           828     828    0.986   0.996  
## 3 tumour  expanded            182      26    0.0371  0.00547
## 4 tumour  singleton          4723    4723    0.963   0.995
b.exp.p1 <- ggplot(b.expanded.clns.summary, aes(x = dataset, y = cln_prop * 100)) +
  geom_bar(stat = "identity", position = "stack",
           aes(fill = b_expanded_cln)) +
  scale_fill_npg(name = "expanded clone") +
  theme_x_rotated() +
  theme(legend.position = "bottom") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(y = "% clones")
#plot(b.exp.p1)

b.exp.p2 <- ggplot(b.expanded.clns.summary, aes(x = dataset, y = cell_prop * 100)) +
  geom_bar(stat = "identity", position = "stack",
           aes(fill = b_expanded_cln)) +
  scale_fill_npg(name = "expanded clone") +
  theme_x_rotated() +
  theme(legend.position = "bottom") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(y = "% cells")
#plot(b.exp.p2)

plot(ggarrange(b.exp.p1, b.exp.p2, nrow = 1))

Summarising for the T cell clonotypes:

t.expanded.clns.summary <- t.data %>%
  group_by(dataset, t_expanded_cln) %>%
  summarise(cell_cnt = length(barcode),
            cln_cnt = length(unique(paired_clonotype))) %>%
  ungroup() %>%
  group_by(dataset) %>%
  mutate(cell_prop = cell_cnt / sum(cell_cnt),
         cln_prop = cln_cnt / sum(cln_cnt)) %>%
  ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
t.expanded.clns.summary %>% head()
## # A tibble: 4 × 6
##   dataset t_expanded_cln cell_cnt cln_cnt cell_prop cln_prop
##   <chr>   <chr>             <int>   <int>     <dbl>    <dbl>
## 1 pbmc    expanded            125      46    0.0247  0.00922
## 2 pbmc    singleton          4943    4943    0.975   0.991  
## 3 tumour  expanded             45      18    0.0264  0.0107 
## 4 tumour  singleton          1661    1661    0.974   0.989
t.exp.p1 <- ggplot(t.expanded.clns.summary, aes(x = dataset, y = cln_prop * 100)) +
  geom_bar(stat = "identity", position = "stack",
           aes(fill = t_expanded_cln)) +
  scale_fill_npg(name = "expanded clone") +
  theme_x_rotated() +
  theme(legend.position = "bottom") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(y = "% clonotypes")
#plot(t.exp.p1)

t.exp.p2 <- ggplot(t.expanded.clns.summary, aes(x = dataset, y = cell_prop * 100)) +
  geom_bar(stat = "identity", position = "stack",
           aes(fill = t_expanded_cln)) +
  scale_fill_npg(name = "expanded clone") +
  theme_x_rotated() +
  theme(legend.position = "bottom") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(y = "% cells")
#plot(t.exp.p2)

plot(ggarrange(t.exp.p1, t.exp.p2, nrow = 1))

We have the UMAP and the stacked bar plot to show the proportions of expanded and singletons, but within the expansions we want to get a visual on how many cells there are in each clone or clonotype. To do this we will make a stacked bar chart that shows the number of cells for each clone/clonotype.

Starting with the B cell expanded clones:

##stacked bar charts for the expanded clones
b.cln.size.summary <- seurat.data %>%
  filter(vdj_cell_type == "B") %>%
  mutate(label = ifelse(b_expanded_cln == "expanded", paste0(unq_clone_id), "singletons")) %>%
  group_by(dataset, label) %>%
  summarise(cell_cnt = length(barcode)) %>%
  ungroup() %>%
  group_by(dataset) %>%
  mutate(cell_prop = cell_cnt / sum(cell_cnt)) %>%
  ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
b.cln.size.summary %>% head()
## # A tibble: 6 × 4
##   dataset label       cell_cnt cell_prop
##   <chr>   <chr>          <int>     <dbl>
## 1 PBMC    pbmc_724           2  0.00240 
## 2 PBMC    pbmc_952           2  0.00240 
## 3 PBMC    singletons       828  0.995   
## 4 tumour  singletons      4723  0.988   
## 5 tumour  tumour_1032        2  0.000418
## 6 tumour  tumour_1039        2  0.000418
b.cln.sizes.p1 <- ggplot(b.cln.size.summary %>% filter(label != "singletons"), 
                         aes(x = dataset, y = cell_cnt, group = cell_cnt)) +
  geom_bar(stat = "identity", 
           position = position_stack(),
           aes(fill = factor(label)),
           linewidth = 0.05, colour = "gray") +
  scale_fill_manual(values = colorRampPalette(pal_d3("category20")(20))(40),
                    name = "clone") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_x_rotated() +
  theme(legend.position = "none") +
  labs(y = "# cells")
plot(b.cln.sizes.p1)

For the T cell expanded clonotypes:

##stacked bar charts for the expanded clonotypes
t.cln.size.summary <- seurat.data %>%
  filter(vdj_cell_type == "T") %>%
  mutate(label = ifelse(t_expanded_cln == "expanded", paste0(paired_clonotype), "singletons")) %>%
  group_by(dataset, label) %>%
  summarise(cell_cnt = length(barcode)) %>%
  ungroup() %>%
  group_by(dataset) %>%
  mutate(cell_prop = cell_cnt / sum(cell_cnt)) %>%
  ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
t.cln.size.summary %>% head()
## # A tibble: 6 × 4
##   dataset label                                               cell_cnt cell_prop
##   <chr>   <chr>                                                  <int>     <dbl>
## 1 PBMC    TRAV1-2_TRAJ33_AVMDSNYQLI                                  5  0.000987
## 2 PBMC    TRBV12-4_TRBJ2-1_ASSLLFGLAGVYNEQF:TRAV9-2_TRAJ28_A…        2  0.000395
## 3 PBMC    TRBV12-4_TRBJ2-3_ASSLALARSTDTQY                            2  0.000395
## 4 PBMC    TRBV12-4_TRBJ2-6_ASSPPRGSSGANVLT:TRAV21_TRAJ56_AVF…        2  0.000395
## 5 PBMC    TRBV18_TRBJ2-1_ASSPPGANEQF:TRAV20_TRAJ4_AVPSFSGGYN…        3  0.000592
## 6 PBMC    TRBV19_TRBJ1-5_ASSIKGGHQPQH:TRAV24_TRAJ45_AFIPGGGA…        3  0.000592
t.cln.sizes.p1 <- ggplot(t.cln.size.summary %>% filter(label != "singletons"), 
                         aes(x = dataset, y = cell_cnt, group = cell_cnt)) +
  geom_bar(stat = "identity", 
           position = position_stack(),
           aes(fill = factor(label)),
           linewidth = 0.05, colour = "gray") +
  scale_fill_manual(values = colorRampPalette(pal_d3("category20")(20))(70),
                    name = "clone") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  theme_x_rotated() +
  theme(legend.position = "none") +
  labs(y = "# cells")
plot(t.cln.sizes.p1)

Finally, build it all into a single panel to summarise the expansions across the two datasets:

## turning off the legends as will be provided by the UMAP but adding a title to each for cell type
b.exp.panel <- ggarrange(b.exp.p1, b.exp.p2, common.legend = TRUE, legend = "none")
b.exp.panel <- annotate_figure(b.exp.panel, top = text_grob("B cells", color = "black", face = "bold", size = 14))

t.exp.panel <- ggarrange(t.exp.p1, t.exp.p2, common.legend = TRUE, legend = "none")
t.exp.panel <- annotate_figure(t.exp.panel, top = text_grob("T cells", color = "black", face = "bold", size = 14))

b.cln.sizes.p1 <- annotate_figure(b.cln.sizes.p1, top = text_grob("B expanded", color = "black", face = "bold", size = 14))
t.cln.sizes.p1 <- annotate_figure(t.cln.sizes.p1, top = text_grob("T expanded", color = "black", face = "bold", size = 14))

#combining the bar plots and the UMAP with the stacked bar charts for clonotype distribution
plot(ggarrange(exp.umap.p1 + theme(legend.position = "top"), 
               ggarrange(b.exp.panel, t.exp.panel, nrow = 2), 
               ggarrange(b.cln.sizes.p1, t.cln.sizes.p1, nrow = 2),
               nrow = 1, widths = c(4, 2, 1)))

Visualising clusters and other features simulatanesouly

One challenge when using data point fill/colour to encode a VDJ feature like SHM is that it becomes hard to track the clusters on the UMAP. Hull plots can be used to automate the generation of outlines for the clusters. To use hull plots need to load the ggforce package:

if (sum(grepl("ggforce", rownames(installed.packages()))) > 0) {
  print("ggforce is already installed")
} else {
  install.packages("ggforce")
  install.packages("concaveman")
}
## [1] "ggforce is already installed"
library(ggforce)

For the hull plots to make reasonable representation of the clusters it is sometime necessary to exclude cells that are outliers from the main density of points for the cluster. This will vary between datasets, so may need to experiment with different values for outlier exclusions. Here using the median UMAP1 + UMAP2 coordinates for clusters and a +/- of 2.2:

clst.outliers <- seurat.data %>% 
  group_by(dataset, seurat_clusters) %>%
  mutate(median_umap_1 = median(umap_1),
         median_umap_2 = median(umap_2)) %>%
  ungroup() %>%
  mutate(cluster_outlier = case_when((umap_1 < (median_umap_1 - 2.2)) | (umap_1 > (median_umap_1 + 2.2)) ~ TRUE,
                                     (umap_2 < (median_umap_2 - 2.2)) | (`umap_2` > (median_umap_2 + 2.2)) ~ TRUE,
                                     TRUE ~ FALSE))

Here is the UMAP with the data points coloured by Seurat cluster:

umap.clst.p1 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2, color = factor(seurat_clusters))) +
  geom_point(size = 0.2) +
  scale_color_d3(palette = "category20",
                 name = "") +
  theme_custom() +
  facet_wrap(~ dataset) +
  guides(colour = guide_legend(override.aes = list(size = 5)))
umap.clst.p1

The result of adding cluster outlines using the hull plots:

##points + hull plot for cluster regions
umap.clst.p2 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2, color = factor(seurat_clusters))) +
  geom_point(size = 0.4) +
  geom_mark_hull(data = clst.outliers %>% filter(!cluster_outlier),
                 aes(fill = factor(seurat_clusters)), concavity = 3, expand = 0.01, radius = 0.01) +
  scale_color_d3(palette = "category20",
                 name = "") +
  scale_fill_d3(palette = "category20",
                name = "") +
  theme_custom() +
  facet_wrap(~ dataset) +
  guides(colour = guide_legend(override.aes = list(size = 5)))
#umap.clst.p2

plot(ggarrange(umap.clst.p1, umap.clst.p2, nrow = 2))

Keeping the hull outline, but showing expanded clones using the data point colouring:

p1 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2)) +
  geom_point(size = 0.8, shape = 21, 
             aes(fill = vh_mut),
             stroke = NA) +
  geom_mark_hull(data = clst.outliers %>% filter(!cluster_outlier),
                 aes(colour = factor(seurat_clusters)), 
                 concavity = 3, expand = 0.01, radius = 0.01,
                 linetype = "dashed", linewidth = 1) +
  scale_fill_viridis_c(name = "VH SHM%") +
  scale_colour_d3(palette = "category20",
                  name = "Cluster") +
  theme_custom() +
  facet_wrap(~ dataset) +
  guides(colour = guide_legend(override.aes = list(size = 5)))
plot(p1)

Saving to output files

As the B and T cell data have been filtered and updated save new versions:

#save the tibble that include the seurat with VDJ
write_tsv(seurat.data, "../data/pbmc-tumour_seruat_data.tsv")

##save the b and t cell data that has the additional columns and filtering
write_tsv(b.data, "../data/pbmc-tumour_b_cells_vdj.tsv")
write_tsv(t.data, "../data/pbmc-tumour_t_cells_vdj.tsv")

Adding metadata to Seurat direct from Cell Ranger VDJ

If you prefer not to post-process data and to use the Cell Ranger outputs directly here is how to add this meta.data to a Suerat object.

The Seurat objects have been unloaded, so need to reload:

pbmc.seurat.obj <- readRDS("../seurat_objects/pbmc_seurat-without-VDJ-genes-azimuth.rds")

Grab the filtered_contig_annotations.csv file from Cell Ranger:

pbmc.b.cellranger <- read_csv("https://cf.10xgenomics.com/samples/cell-vdj/5.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_b_filtered_contig_annotations.csv")
## Rows: 2066 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): barcode, contig_id, chain, v_gene, d_gene, j_gene, c_gene, cdr3, c...
## dbl  (3): length, reads, umis
## lgl  (4): is_cell, high_confidence, full_length, productive
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pbmc.t.cellranger <- read_csv("https://cf.10xgenomics.com/samples/cell-vdj/5.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_t_filtered_contig_annotations.csv")
## Rows: 10817 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): barcode, contig_id, chain, v_gene, d_gene, j_gene, c_gene, cdr3, c...
## dbl  (3): length, reads, umis
## lgl  (4): is_cell, high_confidence, full_length, productive
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The Cell Ranger data is per contig, need to change to per cell barcode before adding to Seurat object. To do this, going to filter T and B for contigs to the top chains by UMI for IGH + IGK/L and TRA + TRB, respectively:

pbmc.b.cellranger.reformat <- pbmc.b.cellranger %>%
  filter(productive == TRUE) %>%
  mutate(chain_grp = ifelse(chain == "IGH", "igh", "igkl")) %>%
  group_by(barcode, chain_grp) %>%
  mutate(chain_rnk = row_number(desc(umis))) %>%
  ungroup() %>%
  filter(chain_rnk == 1) %>%
  pivot_wider(id_cols = c("barcode", "is_cell"),
              names_from = "chain_grp",
              values_from = -c("barcode", "is_cell", "chain_grp")) %>%
  mutate(is_paired_ig = ifelse(!(is.na(v_gene_igh)) & !(is.na(v_gene_igkl)), TRUE, FALSE)) 

pbmc.t.cellranger.reformat <- pbmc.t.cellranger %>%
  filter(productive == TRUE) %>%
  mutate(chain_grp = ifelse(chain == "TRA", "tra", "trb")) %>%
  group_by(barcode, chain_grp) %>%
  mutate(chain_rnk = row_number(desc(umis))) %>%
  ungroup() %>%
  filter(chain_rnk == 1) %>%
  pivot_wider(id_cols = c("barcode", "is_cell"),
              names_from = "chain_grp",
              values_from = -c("barcode", "is_cell", "chain_grp")) %>%
  mutate(is_paired_tr = ifelse(!(is.na(v_gene_tra)) & !(is.na(v_gene_trb)), TRUE, FALSE)) 

Join the T and B cell data and remove any cells that have data for both B and T:

pbmc.cellranger.data <- pbmc.b.cellranger.reformat %>%
  full_join(pbmc.t.cellranger.reformat, by = c("barcode", "is_cell")) %>%
  mutate(dual_cell = ifelse(!(is.na(is_paired_ig)) & !(is.na(is_paired_tr)), TRUE, FALSE)) %>%
  filter(dual_cell == FALSE) %>%
  mutate(vdj_cell_type = case_when(is.na(is_paired_ig) ~ "T",
                                   is.na(is_paired_tr) ~ "B"))

table(pbmc.cellranger.data$vdj_cell_type)
## 
##    B    T 
##  886 5232

Now, format as a data.frame with the cell barcodes as the row names:

pbmc.cellranger.dataframe <- as.data.frame(pbmc.cellranger.data)
rownames(pbmc.cellranger.dataframe) <- pbmc.cellranger.dataframe$barcode
pbmc.cellranger.dataframe <- pbmc.cellranger.dataframe %>% select(-c(barcode))

Finally, can add the metadata to the Seurat object:

pbmc.seurat.obj <- AddMetaData(pbmc.seurat.obj, pbmc.cellranger.dataframe)

colnames(pbmc.seurat.obj@meta.data)
##  [1] "orig.ident"            "nCount_RNA"            "nFeature_RNA"         
##  [4] "nCount_SCT"            "nFeature_SCT"          "SCT_snn_res.0.8"      
##  [7] "seurat_clusters"       "SCT_snn_res.0.7"       "SCT_snn_res.0.6"      
## [10] "SCT_snn_res.0.5"       "SCT_snn_res.0.4"       "SCT_snn_res.0.3"      
## [13] "SCT_snn_res.0.2"       "azimuth_l1"            "azimuth_l2"           
## [16] "azimuth_l3"            "is_cell"               "contig_id_igkl"       
## [19] "contig_id_igh"         "high_confidence_igkl"  "high_confidence_igh"  
## [22] "length_igkl"           "length_igh"            "chain_igkl"           
## [25] "chain_igh"             "v_gene_igkl"           "v_gene_igh"           
## [28] "d_gene_igkl"           "d_gene_igh"            "j_gene_igkl"          
## [31] "j_gene_igh"            "c_gene_igkl"           "c_gene_igh"           
## [34] "full_length_igkl"      "full_length_igh"       "productive_igkl"      
## [37] "productive_igh"        "cdr3_igkl"             "cdr3_igh"             
## [40] "cdr3_nt_igkl"          "cdr3_nt_igh"           "reads_igkl"           
## [43] "reads_igh"             "umis_igkl"             "umis_igh"             
## [46] "raw_clonotype_id_igkl" "raw_clonotype_id_igh"  "raw_consensus_id_igkl"
## [49] "raw_consensus_id_igh"  "chain_rnk_igkl"        "chain_rnk_igh"        
## [52] "is_paired_ig"          "contig_id_tra"         "contig_id_trb"        
## [55] "high_confidence_tra"   "high_confidence_trb"   "length_tra"           
## [58] "length_trb"            "chain_tra"             "chain_trb"            
## [61] "v_gene_tra"            "v_gene_trb"            "d_gene_tra"           
## [64] "d_gene_trb"            "j_gene_tra"            "j_gene_trb"           
## [67] "c_gene_tra"            "c_gene_trb"            "full_length_tra"      
## [70] "full_length_trb"       "productive_tra"        "productive_trb"       
## [73] "cdr3_tra"              "cdr3_trb"              "cdr3_nt_tra"          
## [76] "cdr3_nt_trb"           "reads_tra"             "reads_trb"            
## [79] "umis_tra"              "umis_trb"              "raw_clonotype_id_tra" 
## [82] "raw_clonotype_id_trb"  "raw_consensus_id_tra"  "raw_consensus_id_trb" 
## [85] "chain_rnk_tra"         "chain_rnk_trb"         "is_paired_tr"         
## [88] "dual_cell"             "vdj_cell_type"

Now the VDJ data is available for use using the group.by and split.by for many Seurat functions.

Return to main page.